librarian::shelf(
caret, dismo, dplyr, DT, GADMTools, GGally, ggplot2, here, htmltools, leaflet, mapview, maptools, purrr, ranger, raster, readr, rgbif, rgdal, rJava, rpart, rpart.plot, rsample, pdp, sdmpredictors, skimr, sf, spocc, tidyr, usdm, vip)
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = FALSE)
# set random seed for reproducibility
set.seed(42)
ggplot2::theme_set(ggplot2::theme_light())
# directory to store data
dir_data <- here("data/sdm")
dir.create(dir_data, showWarnings = F, recursive = T)
pts_env_csv <- file.path(dir_data, "pts_env.csv")
pts_geo <- file.path(dir_data, "pts.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
mdl_maxv_rds <- file.path(dir_data, "mdl_maxent_vif.rds")
redo <- TRUE
This machine learning analysis was completed as an assignment for my Master’s program course, Environmental Data Science 232: Machine Learning. It was assigned by our professor, Dr. Ben Best, as an introduction to machine learning by predicting presence of a chosen species from observations and environmental data found on the site iNaturalist. It follows guidance found at Species distribution modeling | R Spatial.
My chosen species is coyote brush (Baccharis pilularis). Baccharis pilularis is native to the west coast of the United States (Oregon, California, and Baja California, Mexico). It is a shurb in the Asteraceae (Sunflower) family with oblanceolate to obovate toothed leaves, panicle-like inflorescence with staminate flowers that when mature mimic snow, and generally sticky (not a pun) (Bogler 2012).
Baccharis pilularis Image Credit: CalScape
The first step in this machine learning excercise is to download observation data of Baccharis pilularis from the Global Biodiversity Information Facility site.
obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
redo <- TRUE
if (!file.exists(obs_geo) | redo){
# get species occurrence data from GBIF with coordinates
(res <- spocc::occ(
query = 'Baccharis pilularis',
from = 'gbif',
has_coords = T,
limit = 10000))
# extract data frame from result
df <- res$gbif$data[[1]]
readr::write_csv(df, obs_csv)
# convert to points of observation from lon/lat columns in data frame
obs <- df %>%
sf::st_as_sf(
coords = c("longitude", "latitude"),
crs = st_crs(4326)) %>%
select(prov, key) # save space (joinable from obs_csv)
sf::write_sf(obs, obs_geo, delete_dsn=T)
}
obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 10000
# show points on map
mapview::mapview(obs, map.types = "CartoDB.Voyager")
obs$key <- as.factor(obs$key)
# count number of observations
obs_num <- nrow(obs)
# Check for duplicates - creates a vector of T or F for each of the points ???should you use 'key' vs 'geom'???
dups <- duplicated(obs$key)
# how many duplicates were there? This will sum only the TRUE values
sum(dups)
## [1] 0
# create lon and lat columns in preparation to clean inaccurate data points
obs <- obs %>%
dplyr::mutate(lon = sf::st_coordinates(.)[,1],
lat = sf::st_coordinates(.)[,2])
usa <- gadm_sf_loadCountries("USA", level = 2, basefile = "data/")
There are 10000` observations for Baccharis pilularis in this data. According to the iNaturalist site, over 19,000 observations have been uploaded of this species.
There were only a few observably inaccurate data points for this species.
The next step is to use the Species Distribution Model predictors R package sdmpredictors to get underlying environmental data for Baccharis pilularis observations.
dir_env <- file.path(dir_data, "env")
# set a default data directory
options(sdmpredictors_datadir = dir_env)
# choosing terrestrial
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)
# show table of datasets
env_datasets %>%
select(dataset_code, description, citation) %>%
DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")
# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)
# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", "WC_bio1", "WC_bio12", "WC_bio12", "ER_PETseasonality", "ER_topoWet", "ER_climaticMoistureIndex")
# get layers
env_stack <- load_layers(env_layers_vec)
# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc=2)
The environmental data is on a global scale. Here we crop the environmental rasters to a region of interest around the distribution of Baccharis pilularis.
obs_hull_geo <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
if (!file.exists(obs_hull_geo) | redo){
# make convex hull around points of observation
obs_hull <- sf::st_convex_hull(st_union(obs))
# save obs hull
write_sf(obs_hull, obs_hull_geo)
}
obs_hull <- read_sf(obs_hull_geo)
# show points on map
mapview(
list(obs, obs_hull))